rm(list = ls())

# # packages only need to be installed once
# install.packages("stm")
# install.packages("wordcloud")
library(stm)
library(wordcloud)

# load the stm models
load('../../0_preparation/1_topicvalidation/models/stm10k1it.Rdata')
load('../../0_preparation/1_topicvalidation/models/stm10k.Rdata')
load('../../0_preparation/1_topicvalidation/models/stm50k.Rdata')
load('../../0_preparation/1_topicvalidation/models/stm100k.Rdata')
load('../../0_preparation/1_topicvalidation/models/stm500k.Rdata')

### replicate Figure SI25 (Word Mass: Beta)
SortedTopics <- stm10k$beta$logbeta[[1]]
for (i in 1:nrow(stm10k$beta$logbeta[[1]])){
  SortedTopics[i,] <- sort(stm10k$beta$logbeta[[1]][i,], decreasing = T)
}
pdf("plots/figureSI25.pdf")
plot(exp(SortedTopics[1,1:30]), 
     ylab = "Beta", xlab = "Top 30 Words",
     ylim = c(0, 0.045), type = "l")
for(i in 2:10){
  lines(exp(SortedTopics[i,1:30]))
}
dev.off()

### replicate Figure SI26 (Word Mass: Log Beta)
pdf("plots/figureSI26.pdf")
plot(SortedTopics[1,1:30], 
     ylab = "Log Beta", xlab = "Top 30 Words",
     ylim = c(-6, -2), type = "l")
for(i in 2:10){
  lines(SortedTopics[i,1:30])
}
dev.off()

### replicate Figure SI27 (Word versus Mass)
pdf("plots/figureSI27.pdf", width = 14, height = 14)
par(mfrow= c(2, 2))
# stm10k
modexam <- stm10k
sortbeta <- t(apply(exp(modexam$beta$logbeta[[1]]), 1, sort, decreasing = TRUE))
avgsortbeta <- colMeans(sortbeta)
cumbeta <- cumsum(avgsortbeta)
plot(NULL, ylim=c(0, 1), xlim = c(0, 1700), main = "STM 10 topics (average)",
     xlab = "Top n words", ylab = "Mass covered", cex.lab = 1, axes = F)
axis(side = 1, at = seq(0, 1500, by = 500), col.ticks = NA, cex.axis = 1)
axis(side = 2, at = seq(0, 1, by = 0.2), col.ticks = NA, cex.axis = 1)
points(cumbeta[1:1500], type = "l")
segments(x0 = c(20, 50, 200), y0 = 0, x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
segments(x0 = 0, y0 = cumbeta[c(20, 50, 200)], x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
text(c(20, 50, 200), -0.0025, labels = c(20, 50, 200), cex = .8, col = "red")
text(0, round(cumbeta[c(20, 50, 200)], 2)+0.03, labels = round(cumbeta[c(20, 50, 200)], 2), cex = .8, col = "red")
segments(x0 = which.min(abs(0.9 - cumbeta)), y0 = 0, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
segments(x0 = 0, y0 = 0.9, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
text(which.min(abs(0.9 - cumbeta)), -0.0025, labels = which.min(abs(0.9 - cumbeta)), cex = .8, col = "blue")
text(0, 0.9+0.03, labels = 0.9, cex = .8, col = "blue") 
# stm50k
modexam <- stm50k
sortbeta <- t(apply(exp(modexam$beta$logbeta[[1]]), 1, sort, decreasing = TRUE))
avgsortbeta <- colMeans(sortbeta)
cumbeta <- cumsum(avgsortbeta)
plot(NULL, ylim=c(0, 1), xlim = c(0, 1700), main = "STM 50 topics (average)",
     xlab = "Top n words", ylab = "Mass covered", cex.lab = 1, axes = F)
axis(side = 1, at = seq(0, 1500, by = 500), col.ticks = NA, cex.axis = 1)
axis(side = 2, at = seq(0, 1, by = 0.2), col.ticks = NA, cex.axis = 1)
points(cumbeta[1:1500], type = "l")
segments(x0 = c(20, 50, 200), y0 = 0, x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
segments(x0 = 0, y0 = cumbeta[c(20, 50, 200)], x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
text(c(20, 50, 200), -0.0025, labels = c(20, 50, 200), cex = .8, col = "red")
text(0, round(cumbeta[c(20, 50, 200)], 2)+0.03, labels = round(cumbeta[c(20, 50, 200)], 2), cex = .8, col = "red")
segments(x0 = which.min(abs(0.9 - cumbeta)), y0 = 0, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
segments(x0 = 0, y0 = 0.9, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
text(which.min(abs(0.9 - cumbeta)), -0.0025, labels = which.min(abs(0.9 - cumbeta)), cex = .8, col = "blue")
text(0, 0.9+0.03, labels = 0.9, cex = .8, col = "blue") 
# stm100k
modexam <- stm100k
sortbeta <- t(apply(exp(modexam$beta$logbeta[[1]]), 1, sort, decreasing = TRUE))
avgsortbeta <- colMeans(sortbeta)
cumbeta <- cumsum(avgsortbeta)
plot(NULL, ylim=c(0, 1), xlim = c(0, 1700), main = "STM 100 topics (average)",
     xlab = "Top n words", ylab = "Mass covered", cex.lab = 1, axes = F)
axis(side = 1, at = seq(0, 1500, by = 500), col.ticks = NA, cex.axis = 1)
axis(side = 2, at = seq(0, 1, by = 0.2), col.ticks = NA, cex.axis = 1)
points(cumbeta[1:1500], type = "l")
segments(x0 = c(20, 50, 200), y0 = 0, x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
segments(x0 = 0, y0 = cumbeta[c(20, 50, 200)], x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
text(c(20, 50, 200), -0.0025, labels = c(20, 50, 200), cex = .8, col = "red")
text(0, round(cumbeta[c(20, 50, 200)], 2)+0.03, labels = round(cumbeta[c(20, 50, 200)], 2), cex = .8, col = "red")
segments(x0 = which.min(abs(0.9 - cumbeta)), y0 = 0, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
segments(x0 = 0, y0 = 0.9, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
text(which.min(abs(0.9 - cumbeta)), -0.0025, labels = which.min(abs(0.9 - cumbeta)), cex = .8, col = "blue")
text(0, 0.9+0.03, labels = 0.9, cex = .8, col = "blue") 
# stm500k
modexam <- stm500k
sortbeta <- t(apply(exp(modexam$beta$logbeta[[1]]), 1, sort, decreasing = TRUE))
avgsortbeta <- colMeans(sortbeta)
cumbeta <- cumsum(avgsortbeta)
plot(NULL, ylim=c(0, 1), xlim = c(0, 1700), main = "STM 500 topics (average)",
     xlab = "Top n words", ylab = "Mass covered", cex.lab = 1, axes = F)
axis(side = 1, at = seq(0, 1500, by = 500), col.ticks = NA, cex.axis = 1)
axis(side = 2, at = seq(0, 1, by = 0.2), col.ticks = NA, cex.axis = 1)
points(cumbeta[1:1500], type = "l")
segments(x0 = c(20, 50, 200), y0 = 0, x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
segments(x0 = 0, y0 = cumbeta[c(20, 50, 200)], x1 = c(20, 50, 200), y1 = cumbeta[c(20, 50, 200)], col = "red")
text(c(20, 50, 200), -0.0025, labels = c(20, 50, 200), cex = .8, col = "red")
text(0, round(cumbeta[c(20, 50, 200)], 2)+0.03, labels = round(cumbeta[c(20, 50, 200)], 2), cex = .8, col = "red")
segments(x0 = which.min(abs(0.9 - cumbeta)), y0 = 0, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
segments(x0 = 0, y0 = 0.9, x1 = which.min(abs(0.9 - cumbeta)), y1 = 0.9, col = "blue")
text(which.min(abs(0.9 - cumbeta)), -0.0025, labels = which.min(abs(0.9 - cumbeta)), cex = .8, col = "blue")
text(0, 0.9+0.03, labels = 0.9, cex = .8, col = "blue") 
dev.off()

### replicate Figure SI28-32 (Word Cloud)
set.seed(123)
s <- sample(1:10, 6)
pdf("plots/figureSI28.pdf", width = 10, height = 7)
par(mfrow = c(2, 3), mar = c(2, 1, 2, 1))
cloud(stmobj = stm10k1it, topic = s[1], max.words = 60)
cloud(stmobj = stm10k1it, topic = s[2], max.words = 60)
cloud(stmobj = stm10k1it, topic = s[3], max.words = 60)
cloud(stmobj = stm10k1it, topic = s[4], max.words = 60)
cloud(stmobj = stm10k1it, topic = s[5], max.words = 60)
cloud(stmobj = stm10k1it, topic = s[6], max.words = 60)
dev.off()

s <- sample(1:10, 6)
pdf("plots/figureSI29.pdf", width = 10, height = 7)
par(mfrow = c(2, 3), mar = c(2, 1, 2, 1))
cloud(stmobj = stm10k, topic = s[1], max.words = 60)
cloud(stmobj = stm10k, topic = s[2], max.words = 60)
cloud(stmobj = stm10k, topic = s[3], max.words = 60)
cloud(stmobj = stm10k, topic = s[4], max.words = 60)
cloud(stmobj = stm10k, topic = s[5], max.words = 60)
cloud(stmobj = stm10k, topic = s[6], max.words = 60)
dev.off()

s <- sample(1:50, 6)
pdf("plots/figureSI30.pdf", width = 10, height = 7)
par(mfrow = c(2, 3), mar = c(2, 1, 2, 1))
cloud(stmobj = stm50k, topic = s[1], max.words = 60)
cloud(stmobj = stm50k, topic = s[2], max.words = 60)
cloud(stmobj = stm50k, topic = s[3], max.words = 60)
cloud(stmobj = stm50k, topic = s[4], max.words = 60)
cloud(stmobj = stm50k, topic = s[5], max.words = 60)
cloud(stmobj = stm50k, topic = s[6], max.words = 60)
dev.off()

s <- sample(1:100, 6)
pdf("plots/figureSI31.pdf", width = 10, height = 7)
par(mfrow = c(2, 3), mar = c(2, 1, 2, 1))
cloud(stmobj = stm100k, topic = s[1], max.words = 50)
cloud(stmobj = stm100k, topic = s[2], max.words = 50)
cloud(stmobj = stm100k, topic = s[3], max.words = 50)
cloud(stmobj = stm100k, topic = s[4], max.words = 50)
cloud(stmobj = stm100k, topic = s[5], max.words = 50)
cloud(stmobj = stm100k, topic = s[6], max.words = 50)
dev.off()

s <- sample(1:500, 6)
pdf("plots/figureSI32.pdf", width = 10, height = 7)
par(mfrow = c(2, 3), mar = c(2, 1, 2, 1))
cloud(stmobj = stm500k, topic = s[1], max.words = 50)
cloud(stmobj = stm500k, topic = s[2], max.words = 50)
cloud(stmobj = stm500k, topic = s[3], max.words = 50)
cloud(stmobj = stm500k, topic = s[4], max.words = 50)
cloud(stmobj = stm500k, topic = s[5], max.words = 50)
cloud(stmobj = stm500k, topic = s[6], max.words = 50)
dev.off()

# Note: The representative documents in Table SI8 are produced by the findThoughts()
# function from the stm package. We do not include the original text corpus in this
# replication file for copyright reasons.
